##Packages

#install.packages('tidyverse')
#install.packages('dplyr')
#install.packages('car')
#install.packages("DescTools")
#install.packages("arules")
#install.packages("Rtools")
#install.packages("stats")
#install.packages("forecast")
#install.packages("TSstudio")
library(arules)
## Loading required package: Matrix
## 
## Attaching package: 'arules'
## The following objects are masked from 'package:base':
## 
##     abbreviate, write
library(DescTools)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:arules':
## 
##     intersect, recode, setdiff, setequal, union
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(tidyr)
## 
## Attaching package: 'tidyr'
## The following objects are masked from 'package:Matrix':
## 
##     expand, pack, unpack
library(car)
## Loading required package: carData
## 
## Attaching package: 'car'
## The following object is masked from 'package:dplyr':
## 
##     recode
## The following object is masked from 'package:DescTools':
## 
##     Recode
## The following object is masked from 'package:arules':
## 
##     recode
library(stats)
library(lubridate)
## 
## Attaching package: 'lubridate'
## The following objects are masked from 'package:arules':
## 
##     intersect, setdiff, union
## The following objects are masked from 'package:base':
## 
##     date, intersect, setdiff, union
library(ggplot2)
library(forecast)
## Registered S3 method overwritten by 'quantmod':
##   method            from
##   as.zoo.data.frame zoo
## 
## Attaching package: 'forecast'
## The following object is masked from 'package:DescTools':
## 
##     BoxCox
library(tseries)
library(caret)
## Loading required package: lattice
## 
## Attaching package: 'caret'
## The following objects are masked from 'package:DescTools':
## 
##     MAE, RMSE
library(TSstudio)
cancer_incidence<- read.csv(file="C:/Users/olivi/OneDrive/Documents/DataSets/13100109-eng/13100109.csv", sep = ",")
#first remove columns from cancer incidence frame
##############################################################
cancer_table<- subset(cancer_incidence, select = -c(DGUID,UOM_ID,UOM,SCALAR_FACTOR,SCALAR_ID, VECTOR, COORDINATE, STATUS, SYMBOL, TERMINATED, DECIMALS))

#now remove all rows that contain the characteristic value that we don't want to measure

cancer_cl<-cancer_table[-grep("Statistically different from", cancer_table$Characteristics),]
cancer_cl<- cancer_cl[grep("age-standardized", cancer_cl$Characteristics),]
cancer_cl<-cancer_cl[-grep("95%", cancer_cl$Characteristics),]
#cancer_cl<-cancer_cl[grep("Both", cancer_cl$Sex),]
#cancer_cl<-cancer_cl[grep("All invasive", cancer_cl$Selected.sites.of.cancer..ICD.O.3.),]


#try spread
#remove duplicated elements so spread works
cancer_w<-cancer_cl[!duplicated(cancer_cl),]
cancer_w4<-spread(cancer_w, Characteristics, VALUE)
cancer_w4<- cancer_w4[-grep("Canada", cancer_w4$GEO),]
cancer_w4<- cancer_w4[-grep("Peer", cancer_w4$GEO),]
n_distinct(cancer_cl$GEO)
## [1] 154
##154

summary(cancer_w4)
##  ï..REF_DATE            GEO                Sex           
##  Length:19162       Length:19162       Length:19162      
##  Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character  
##                                                          
##                                                          
##                                                          
##                                                          
##  Selected.sites.of.cancer..ICD.O.3.
##  Length:19162                      
##  Class :character                  
##  Mode  :character                  
##                                    
##                                    
##                                    
##                                    
##  Cancer incidence (age-standardized rate per 100,000 population)
##  Min.   :   0.0                                                 
##  1st Qu.:  68.6                                                 
##  Median :  97.3                                                 
##  Mean   : 211.2                                                 
##  3rd Qu.: 439.5                                                 
##  Max.   :1295.7                                                 
##  NA's   :1049
str(cancer_w4)
## 'data.frame':    19162 obs. of  5 variables:
##  $ ï..REF_DATE                                                    : chr  "2001/2003" "2001/2003" "2001/2003" "2001/2003" ...
##  $ GEO                                                            : chr  "Alberta" "Alberta" "Alberta" "Alberta" ...
##  $ Sex                                                            : chr  "Both sexes" "Both sexes" "Both sexes" "Females" ...
##  $ Selected.sites.of.cancer..ICD.O.3.                             : chr  "All invasive primary cancer sites (including in situ bladder)" "Bronchus and lung cancer [C34.0-C34.9]" "Colon, rectum and rectosigmoid junction cancer [C18.0-C18.9, C19.9, C20.9, C26.0]" "All invasive primary cancer sites (including in situ bladder)" ...
##  $ Cancer incidence (age-standardized rate per 100,000 population): num  543.8 68.9 63.2 467.9 57.7 ...
hist(cancer_w4$`Cancer incidence (age-standardized rate per 100,000 population)`)

cancer_2<-cancer_table[-grep("Statistically different from", cancer_table$Characteristics),]
cancer_2<-cancer_2[-grep("95%", cancer_2$Characteristics),]
cancer_2<-cancer_2[grep("age-standardized", cancer_2$Characteristics),]
cancer_2<-cancer_2[-grep("Canada", cancer_2$GEO),]
cancer_2<-cancer_2[-grep("Peer", cancer_2$GEO),]
cancer_2<- cancer_2[!duplicated(cancer_2),]


cancer_2.1<- spread(cancer_2, Characteristics, VALUE)

cancer_2.2<- cancer_2.1[,1:5]
cancer_2.2<- spread(cancer_2.2, Selected.sites.of.cancer..ICD.O.3., 'Cancer incidence (age-standardized rate per 100,000 population)')


cancer_2.2<-as.data.frame(unclass(cancer_2.2), stringsAsFactors = TRUE)
#levels(cancer_2.2$GEO)
hist(cancer_2.2[,4], breaks = 50, main = "All cancer sites")

hist(cancer_2.2[,5], breaks = 50, main = "Bronchus and lung cancer")

hist(cancer_2.2[,6], breaks = 50, main = "Colon cancer")

hist(cancer_2.2[,7], breaks = 50, main = "Female breast cancer")

hist(cancer_2.2[,8], breaks = 25, main = "Prostate cancer")

cancer_2.3<- cancer_2.2[,1:3]

cancer_2.3["Quantile_All"]<- CutQ(cancer_2.2[,4], breaks = 6, na.rm=FALSE)
cancer_2.3["Quantile_bronc"]<- CutQ(cancer_2.2[,5], breaks = 6, na.rm=FALSE)
cancer_2.3["Quantile_colon"]<- CutQ(cancer_2.2[,6], breaks = 6, na.rm=FALSE)
cancer_2.3["Quantile_breast"]<- CutQ(cancer_2.2[,7],breaks = 6, na.rm=FALSE)
cancer_2.3["Quantile_prost"]<- CutQ(cancer_2.2[,8], breaks = 6, na.rm =FALSE)
##not sure if apriori would work with this data, haven't figured it out yet
#when I had results none were pertaining to geography

geo<- list(cancer_2.3$GEO)

#assco_rules<- apriori(cancer_2.3, parameter = list(supp = 0.01, conf = 0.05), appearance = list(lhs=geo))
#summary(assco_rules)
#inspect(assco_rules)
#######SPLIT NEW TABLE INTO GEOGRAPHIC REGIONS
##with quantiles
cancer_ON.3<- cancer_2.3[grep("Ontario", cancer_2.3$GEO),]

cancer_QC.3<- cancer_2.3[grep("Quebec", cancer_2.3$GEO),]
cancer_BC.3<- cancer_2.3[grep("British Columbia", cancer_2.3$GEO),]

cancer_MB.3<- cancer_2.3[grep("Manitoba", cancer_2.3$GEO),]
cancer_SK.3<- cancer_2.3[grep("Saskatchewan", cancer_2.3$GEO),]
cancer_AB.3<- cancer_2.3[grep("Alberta", cancer_2.3$GEO),]
cancer_midwest.3<- rbind(cancer_MB.3,cancer_AB.3,cancer_SK.3)

cancer_NL.3<- cancer_2.3[grep("Newfoundland", cancer_2.3$GEO),]
cancer_NB.3<- cancer_2.3[grep("New Brunswick", cancer_2.3$GEO),]
cancer_NS.3<- cancer_2.3[grep("Nova Scotia", cancer_2.3$GEO),]
cancer_PEI.3<- cancer_2.3[grep("Prince Edward Island",cancer_2.3$GEO),]
cancer_maritimes.3<- rbind(cancer_NS.3,cancer_NB.3,cancer_NL.3,cancer_PEI.3)

cancer_YT.3<- cancer_2.3[grep("Yukon", cancer_2.3$GEO),]
cancer_NWT.3<-cancer_2.3[grep("Northwest Territories", cancer_2.3$GEO),]
cancer_NVT.3<-cancer_2.3[grep("Nunavut", cancer_2.3$GEO),]
cancer_territories.3<- rbind(cancer_NWT.3,cancer_YT.3,cancer_NVT.3)


##continuous
cancer_ON.2<- cancer_2.2[grep("Ontario", cancer_2.2$GEO),]
cancer_QC.2<- cancer_2.2[grep("Quebec", cancer_2.2$GEO),]
cancer_BC.2<- cancer_2.2[grep("British Columbia", cancer_2.2$GEO),]

cancer_MB.2<- cancer_2.2[grep("Manitoba", cancer_2.2$GEO),]
cancer_SK.2<- cancer_2.2[grep("Saskatchewan", cancer_2.2$GEO),]
cancer_AB.2<- cancer_2.2[grep("Alberta", cancer_2.2$GEO),]
cancer_midwest.2<- rbind(cancer_MB.2,cancer_AB.2,cancer_SK.2)

cancer_NL.2<- cancer_2.2[grep("Newfoundland", cancer_2.2$GEO),]
cancer_NB.2<- cancer_2.2[grep("New Brunswick", cancer_2.2$GEO),]
cancer_NS.2<- cancer_2.2[grep("Nova Scotia", cancer_2.2$GEO),]
cancer_PEI.2<- cancer_2.2[grep("Prince Edward Island",cancer_2.2$GEO),]
cancer_maritimes.2<- rbind(cancer_NS.2,cancer_NB.2,cancer_NL.2,cancer_PEI.2)

cancer_YT.2<- cancer_2.2[grep("Yukon", cancer_2.2$GEO),]
cancer_NWT.2<-cancer_2.2[grep("Northwest Territories", cancer_2.2$GEO),]
cancer_NVT.2<-cancer_2.2[grep("Nunavut", cancer_2.2$GEO),]
cancer_territories.2<- rbind(cancer_NWT.2,cancer_YT.2,cancer_NVT.2)
##try time series analysis
###Just a peek

ts_ON.2<-ts(data = cancer_ON.2, frequency = n_distinct(cancer_ON.2$GEO) ,start = 2001, end = 2015)
ts.plot(ts_ON.2)

ts_QC.2<-ts(data = cancer_QC.2, frequency = n_distinct(cancer_QC.2$GEO) ,start = 2001, end = 2015)
ts.plot(ts_QC.2)

ts_BC.2<-ts(data = cancer_BC.2, frequency = n_distinct(cancer_BC.2$GEO) ,start = 2001, end = 2015)
ts.plot(ts_BC.2)

ts_midwest.2<-ts(data = cancer_midwest.2, frequency = n_distinct(cancer_midwest.2$GEO) ,start = 2001, end = 2015)
ts.plot(ts_midwest.2)

ts_maritimes.2<-ts(data = cancer_maritimes.2,frequency = n_distinct(cancer_maritimes.2$GEO) , start = 2001, end = 2015)
ts.plot(ts_maritimes.2)

ts_territories.2<-ts(data = cancer_territories.2, frequency = n_distinct(cancer_territories.2$GEO) ,start = 2001, end = 2015)
ts.plot(ts_territories.2)

class(ts_BC.2)
## [1] "mts"    "ts"     "matrix"
####SO the frequency used is the number of distinct values in the GEO feature. 
####This is because this is the "interval" that repeats over time.
##"Argument frequency indicates the sampling frequency of the time series, with the default value 1 indicating one sample in each unit time interval."
plot(decompose(ts_ON.2[,4]))

plot(decompose(ts_ON.2[,5]))

plot(decompose(ts_ON.2[,6]))

#plot(decompose(ts_ON.2[,7], na.action(na.pass)))
#plot(decompose(ts_ON.2[,8],na.action(na.pass)))

plot(decompose(ts_QC.2[,4]))

plot(decompose(ts_QC.2[,5]))

plot(decompose(ts_QC.2[,6]))

plot(decompose(ts_BC.2[,4]))

plot(decompose(ts_BC.2[,5]))

plot(decompose(ts_BC.2[,6]))

plot(decompose(ts_midwest.2[,4]))##residuals are not white noise

plot(decompose(ts_midwest.2[,5]))##residuals are not white noise

plot(decompose(ts_midwest.2[,6]))##residuals are not white noise

plot(decompose(ts_maritimes.2[,4]))

plot(decompose(ts_maritimes.2[,5]))##residuals are not white noise

plot(decompose(ts_maritimes.2[,6]))

plot(decompose(ts_territories.2[,4]))

plot(decompose(ts_territories.2[,5]))

plot(decompose(ts_territories.2[,6]))

##All cancer

ts.plot(ts_ON.2[,4], main = "All sites of cancer")

ts.plot(ts_QC.2[,4], main = "All sites of cancer")

ts.plot(ts_BC.2[,4], main = "All sites of cancer")

ts.plot(ts_midwest.2[,4], main = "All sites of cancer")

ts.plot(ts_maritimes.2[,4], main = "All sites of cancer")

ts.plot(ts_territories.2[,4], main = "All sites of cancer")

#Lung

ts.plot(ts_ON.2[,5], main="Bronchus and lung cancer")

ts.plot(ts_QC.2[,5], main="Bronchus and lung cancer")

ts.plot(ts_BC.2[,5], main="Bronchus and lung cancer")

ts.plot(ts_midwest.2[,5], main="Bronchus and lung cancer")

ts.plot(ts_maritimes.2[,5], main="Bronchus and lung cancer")

ts.plot(ts_territories.2[,5], main="Bronchus and lung cancer")

#Colon

ts.plot(ts_ON.2[,6], main="Colon cancer")

ts.plot(ts_QC.2[,6], main="Colon cancer")

ts.plot(ts_BC.2[,6], main="Colon cancer")

ts.plot(ts_midwest.2[,6], main="Colon cancer")

ts.plot(ts_maritimes.2[,6], main="Colon cancer")

ts.plot(ts_territories.2[,6], main="Colon cancer")

###check ACF to determine what predictive model to use
#Ontario
acf(ts_ON.2[,4], type = "correlation")

acf(ts_ON.2[,4], type= "partial")

acf(ts_ON.2[,5], type = "correlation")

acf(ts_ON.2[,5], type= "partial")

acf(ts_ON.2[,6], type = "correlation")

acf(ts_ON.2[,6], type= "partial")

#Quebec
acf(ts_QC.2[,4], type = "correlation")

acf(ts_QC.2[,4], type= "partial")

acf(ts_QC.2[,5], type = "correlation")

acf(ts_QC.2[,5], type= "partial")

acf(ts_QC.2[,6], type = "correlation")

acf(ts_QC.2[,6], type= "partial")

#BRITISH COLUMBIA
acf(ts_BC.2[,4], type = "correlation")

acf(ts_BC.2[,4], type= "partial")

acf(ts_BC.2[,5], type = "correlation")

acf(ts_BC.2[,5], type= "partial")

acf(ts_BC.2[,6], type = "correlation")

acf(ts_BC.2[,6], type= "partial")

##MIDWEST
acf(ts_midwest.2[,4], type = "correlation")

acf(ts_midwest.2[,4], type= "partial")

acf(ts_midwest.2[,5], type = "correlation")

acf(ts_midwest.2[,5], type= "partial")

acf(ts_midwest.2[,6], type = "correlation")

acf(ts_midwest.2[,6], type= "partial")

#Maritimes
acf(ts_maritimes.2[,4], type = "correlation")

acf(ts_maritimes.2[,4], type= "partial")

acf(ts_maritimes.2[,5], type = "correlation")

acf(ts_maritimes.2[,5], type= "partial")

acf(ts_maritimes.2[,6], type = "correlation")

acf(ts_maritimes.2[,6], type= "partial")

#territories
acf(ts_territories.2[,4], type = "correlation")

acf(ts_territories.2[,4], type= "partial")

acf(ts_territories.2[,5], type = "correlation")

acf(ts_territories.2[,5], type= "partial")

acf(ts_territories.2[,6], type = "correlation")

acf(ts_territories.2[,6], type= "partial")

#####
###doesn't look like any of them are sufficiently stationary
##makes me want to use ARIMA
##time series regression
###see decomposition
##tests for stationarity
#ljung-box - non-stationary will have low p-value
##other tests indicated a lag of 6 so we will see how this changes the ljung-box test
Box.test(ts_ON.2[,4], type="Ljung-Box")
## 
##  Box-Ljung test
## 
## data:  ts_ON.2[, 4]
## X-squared = 41.804, df = 1, p-value = 1.009e-10
Box.test(ts_ON.2[,5], type="Ljung-Box")
## 
##  Box-Ljung test
## 
## data:  ts_ON.2[, 5]
## X-squared = 20.283, df = 1, p-value = 6.678e-06
Box.test(ts_ON.2[,6], type="Ljung-Box")
## 
##  Box-Ljung test
## 
## data:  ts_ON.2[, 6]
## X-squared = 11.866, df = 1, p-value = 0.0005718
Box.test(ts_QC.2[,4], type="Ljung-Box")##fail to reject null of stationarity
## 
##  Box-Ljung test
## 
## data:  ts_QC.2[, 4]
## X-squared = 3.5323, df = 1, p-value = 0.06018
Box.test(ts_QC.2[,5], type="Ljung-Box")
## 
##  Box-Ljung test
## 
## data:  ts_QC.2[, 5]
## X-squared = 31.127, df = 1, p-value = 2.417e-08
Box.test(ts_QC.2[,6], type="Ljung-Box")
## 
##  Box-Ljung test
## 
## data:  ts_QC.2[, 6]
## X-squared = 31.829, df = 1, p-value = 1.684e-08
Box.test(ts_BC.2[,4], type="Ljung-Box")
## 
##  Box-Ljung test
## 
## data:  ts_BC.2[, 4]
## X-squared = 14.29, df = 1, p-value = 0.0001567
Box.test(ts_BC.2[,5], type="Ljung-Box")
## 
##  Box-Ljung test
## 
## data:  ts_BC.2[, 5]
## X-squared = 6.0415, df = 1, p-value = 0.01397
Box.test(ts_BC.2[,6], type="Ljung-Box")
## 
##  Box-Ljung test
## 
## data:  ts_BC.2[, 6]
## X-squared = 7.7952, df = 1, p-value = 0.005239
Box.test(ts_midwest.2[,4], type="Ljung-Box")
## 
##  Box-Ljung test
## 
## data:  ts_midwest.2[, 4]
## X-squared = 32.01, df = 1, p-value = 1.534e-08
Box.test(ts_midwest.2[,5], type="Ljung-Box")
## 
##  Box-Ljung test
## 
## data:  ts_midwest.2[, 5]
## X-squared = 13.422, df = 1, p-value = 0.0002486
Box.test(ts_midwest.2[,6], type="Ljung-Box")##fail to reject null of stationarity
## 
##  Box-Ljung test
## 
## data:  ts_midwest.2[, 6]
## X-squared = 2.085, df = 1, p-value = 0.1487
Box.test(ts_maritimes.2[,4], type="Ljung-Box")
## 
##  Box-Ljung test
## 
## data:  ts_maritimes.2[, 4]
## X-squared = 34.233, df = 1, p-value = 4.889e-09
Box.test(ts_maritimes.2[,5], type="Ljung-Box")
## 
##  Box-Ljung test
## 
## data:  ts_maritimes.2[, 5]
## X-squared = 41.948, df = 1, p-value = 9.374e-11
Box.test(ts_maritimes.2[,6], type="Ljung-Box")
## 
##  Box-Ljung test
## 
## data:  ts_maritimes.2[, 6]
## X-squared = 8.2406, df = 1, p-value = 0.004096
Box.test(ts_territories.2[,4], type="Ljung-Box")##fail to reject null of stationarity
## 
##  Box-Ljung test
## 
## data:  ts_territories.2[, 4]
## X-squared = 0.19385, df = 1, p-value = 0.6597
Box.test(ts_territories.2[,5], type="Ljung-Box")##fail to reject null of stationarity
## 
##  Box-Ljung test
## 
## data:  ts_territories.2[, 5]
## X-squared = 1.9755, df = 1, p-value = 0.1599
Box.test(ts_territories.2[,6], type="Ljung-Box")##fail to reject null of stationarity
## 
##  Box-Ljung test
## 
## data:  ts_territories.2[, 6]
## X-squared = 8.0948e-06, df = 1, p-value = 0.9977
#generally reject the null that these are stationary time series

#stationary might indicate that there is an equally likely 
#chance of cancer diagnosis regardless of location
#Augmented dickey-fuller t-stat test for unit root
options(warn = -1)
#non-stationary has a large p-value
adf.test(ts_ON.2[,4])
## 
##  Augmented Dickey-Fuller Test
## 
## data:  ts_ON.2[, 4]
## Dickey-Fuller = -8.561, Lag order = 8, p-value = 0.01
## alternative hypothesis: stationary
adf.test(ts_ON.2[,5])
## 
##  Augmented Dickey-Fuller Test
## 
## data:  ts_ON.2[, 5]
## Dickey-Fuller = -7.9376, Lag order = 8, p-value = 0.01
## alternative hypothesis: stationary
adf.test(ts_ON.2[,6])
## 
##  Augmented Dickey-Fuller Test
## 
## data:  ts_ON.2[, 6]
## Dickey-Fuller = -7.7782, Lag order = 8, p-value = 0.01
## alternative hypothesis: stationary
adf.test(ts_QC.2[,4])
## 
##  Augmented Dickey-Fuller Test
## 
## data:  ts_QC.2[, 4]
## Dickey-Fuller = -6.9374, Lag order = 6, p-value = 0.01
## alternative hypothesis: stationary
adf.test(ts_QC.2[,5])
## 
##  Augmented Dickey-Fuller Test
## 
## data:  ts_QC.2[, 5]
## Dickey-Fuller = -7.5649, Lag order = 6, p-value = 0.01
## alternative hypothesis: stationary
adf.test(ts_QC.2[,6])
## 
##  Augmented Dickey-Fuller Test
## 
## data:  ts_QC.2[, 6]
## Dickey-Fuller = -4.9935, Lag order = 6, p-value = 0.01
## alternative hypothesis: stationary
adf.test(ts_BC.2[,4])
## 
##  Augmented Dickey-Fuller Test
## 
## data:  ts_BC.2[, 4]
## Dickey-Fuller = -5.5942, Lag order = 6, p-value = 0.01
## alternative hypothesis: stationary
adf.test(ts_BC.2[,5])
## 
##  Augmented Dickey-Fuller Test
## 
## data:  ts_BC.2[, 5]
## Dickey-Fuller = -5.1055, Lag order = 6, p-value = 0.01
## alternative hypothesis: stationary
adf.test(ts_BC.2[,6])
## 
##  Augmented Dickey-Fuller Test
## 
## data:  ts_BC.2[, 6]
## Dickey-Fuller = -5.318, Lag order = 6, p-value = 0.01
## alternative hypothesis: stationary
adf.test(ts_midwest.2[,4])
## 
##  Augmented Dickey-Fuller Test
## 
## data:  ts_midwest.2[, 4]
## Dickey-Fuller = -7.0176, Lag order = 6, p-value = 0.01
## alternative hypothesis: stationary
adf.test(ts_midwest.2[,5])
## 
##  Augmented Dickey-Fuller Test
## 
## data:  ts_midwest.2[, 5]
## Dickey-Fuller = -7.0317, Lag order = 6, p-value = 0.01
## alternative hypothesis: stationary
adf.test(ts_midwest.2[,6])
## 
##  Augmented Dickey-Fuller Test
## 
## data:  ts_midwest.2[, 6]
## Dickey-Fuller = -6.2022, Lag order = 6, p-value = 0.01
## alternative hypothesis: stationary
adf.test(ts_maritimes.2[,4])
## 
##  Augmented Dickey-Fuller Test
## 
## data:  ts_maritimes.2[, 4]
## Dickey-Fuller = -5.9679, Lag order = 6, p-value = 0.01
## alternative hypothesis: stationary
adf.test(ts_maritimes.2[,5])
## 
##  Augmented Dickey-Fuller Test
## 
## data:  ts_maritimes.2[, 5]
## Dickey-Fuller = -6.2437, Lag order = 6, p-value = 0.01
## alternative hypothesis: stationary
adf.test(ts_maritimes.2[,6])
## 
##  Augmented Dickey-Fuller Test
## 
## data:  ts_maritimes.2[, 6]
## Dickey-Fuller = -6.7881, Lag order = 6, p-value = 0.01
## alternative hypothesis: stationary
adf.test(ts_territories.2[,4])##fail to reject null of non-stationarity
## 
##  Augmented Dickey-Fuller Test
## 
## data:  ts_territories.2[, 4]
## Dickey-Fuller = -2.4322, Lag order = 3, p-value = 0.403
## alternative hypothesis: stationary
adf.test(ts_territories.2[,5])##fail to reject null of non-stationarity
## 
##  Augmented Dickey-Fuller Test
## 
## data:  ts_territories.2[, 5]
## Dickey-Fuller = -2.6982, Lag order = 3, p-value = 0.2979
## alternative hypothesis: stationary
adf.test(ts_territories.2[,6])##fail to reject null of non-stationarity
## 
##  Augmented Dickey-Fuller Test
## 
## data:  ts_territories.2[, 6]
## Dickey-Fuller = -2.1182, Lag order = 3, p-value = 0.527
## alternative hypothesis: stationary
##reject the null of non-stationarity for all except ^^
##KPSS for level or trend stationarity
options(warn = -1)
## lower p-value indicates non stationarity
kpss.test(ts_ON.2[,4], null = "Trend")
## 
##  KPSS Test for Trend Stationarity
## 
## data:  ts_ON.2[, 4]
## KPSS Trend = 0.03125, Truncation lag parameter = 6, p-value = 0.1
kpss.test(ts_ON.2[,5], null = "Trend")
## 
##  KPSS Test for Trend Stationarity
## 
## data:  ts_ON.2[, 5]
## KPSS Trend = 0.02115, Truncation lag parameter = 6, p-value = 0.1
kpss.test(ts_ON.2[,6], null = "Trend")
## 
##  KPSS Test for Trend Stationarity
## 
## data:  ts_ON.2[, 6]
## KPSS Trend = 0.045779, Truncation lag parameter = 6, p-value = 0.1
kpss.test(ts_QC.2[,4], null = "Trend")
## 
##  KPSS Test for Trend Stationarity
## 
## data:  ts_QC.2[, 4]
## KPSS Trend = 0.027699, Truncation lag parameter = 5, p-value = 0.1
kpss.test(ts_QC.2[,5], null = "Trend")
## 
##  KPSS Test for Trend Stationarity
## 
## data:  ts_QC.2[, 5]
## KPSS Trend = 0.030488, Truncation lag parameter = 5, p-value = 0.1
kpss.test(ts_QC.2[,6], null = "Trend")
## 
##  KPSS Test for Trend Stationarity
## 
## data:  ts_QC.2[, 6]
## KPSS Trend = 0.043577, Truncation lag parameter = 5, p-value = 0.1
kpss.test(ts_BC.2[,4], null = "Trend")
## 
##  KPSS Test for Trend Stationarity
## 
## data:  ts_BC.2[, 4]
## KPSS Trend = 0.094938, Truncation lag parameter = 4, p-value = 0.1
kpss.test(ts_BC.2[,5], null = "Trend")
## 
##  KPSS Test for Trend Stationarity
## 
## data:  ts_BC.2[, 5]
## KPSS Trend = 0.033942, Truncation lag parameter = 4, p-value = 0.1
kpss.test(ts_BC.2[,6], null = "Trend")
## 
##  KPSS Test for Trend Stationarity
## 
## data:  ts_BC.2[, 6]
## KPSS Trend = 0.04278, Truncation lag parameter = 4, p-value = 0.1
kpss.test(ts_midwest.2[,4], null = "Trend")##reject the null that this is level stationary
## 
##  KPSS Test for Trend Stationarity
## 
## data:  ts_midwest.2[, 4]
## KPSS Trend = 0.096673, Truncation lag parameter = 5, p-value = 0.1
kpss.test(ts_midwest.2[,5], null = "Trend")##reject the null that this is level stationary
## 
##  KPSS Test for Trend Stationarity
## 
## data:  ts_midwest.2[, 5]
## KPSS Trend = 0.048228, Truncation lag parameter = 5, p-value = 0.1
kpss.test(ts_midwest.2[,6], null = "Trend")##reject the null that this is level/trend stationary
## 
##  KPSS Test for Trend Stationarity
## 
## data:  ts_midwest.2[, 6]
## KPSS Trend = 0.3926, Truncation lag parameter = 5, p-value = 0.01
kpss.test(ts_maritimes.2[,4], null = "Trend")##reject the null that this is level/trend stationary
## 
##  KPSS Test for Trend Stationarity
## 
## data:  ts_maritimes.2[, 4]
## KPSS Trend = 0.16538, Truncation lag parameter = 5, p-value = 0.03385
kpss.test(ts_maritimes.2[,5], null = "Trend")##reject the null that this is trend stationary
## 
##  KPSS Test for Trend Stationarity
## 
## data:  ts_maritimes.2[, 5]
## KPSS Trend = 0.3217, Truncation lag parameter = 5, p-value = 0.01
kpss.test(ts_maritimes.2[,6], null = "Trend")##reject the null that this is level/trend stationary
## 
##  KPSS Test for Trend Stationarity
## 
## data:  ts_maritimes.2[, 6]
## KPSS Trend = 0.1833, Truncation lag parameter = 5, p-value = 0.02226
kpss.test(ts_territories.2[,4], null = "Trend")
## 
##  KPSS Test for Trend Stationarity
## 
## data:  ts_territories.2[, 4]
## KPSS Trend = 0.097352, Truncation lag parameter = 3, p-value = 0.1
kpss.test(ts_territories.2[,5], null = "Trend")
## 
##  KPSS Test for Trend Stationarity
## 
## data:  ts_territories.2[, 5]
## KPSS Trend = 0.075875, Truncation lag parameter = 3, p-value = 0.1
kpss.test(ts_territories.2[,6], null = "Trend")
## 
##  KPSS Test for Trend Stationarity
## 
## data:  ts_territories.2[, 6]
## KPSS Trend = 0.065017, Truncation lag parameter = 3, p-value = 0.1
##cannot reject the null of stationarity, p-value indicates they could be stationary 
split.ON<-ts_split(ts.obj=ts_ON.2, sample.out =104)
train.ON<- split.ON$train
test.ON<- split.ON$test

split.QC<- ts_split(ts_QC.2, sample.out = 38)
train.QC<- split.QC$train
test.QC<- split.QC$test

split.BC<- ts_split(ts_BC.2, sample.out = 34)
train.BC<- split.BC$train
test.BC<- split.BC$test

split.mid<- ts_split(ts_midwest.2, sample.out =48)
train.mid<- split.mid$train
test.mid<- split.mid$test

split.mar<- ts_split(ts_maritimes.2, sample.out = 38)
train.mar<- split.mar$train
test.mar<- split.mar$test

split.terr<- ts_split(ts_territories.2, sample.out = 6)
train.terr<-split.terr$train
test.terr<-split.terr$test
#TSLM
#need to train models
##time series regression [tsr]-> tslm

tsr_ON<- tslm(formula= train.ON[,4]~GEO+Sex,data=train.ON)
summary(tsr_ON)
## 
## Call:
## tslm(formula = train.ON[, 4] ~ GEO + Sex, data = train.ON)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -177.08  -56.27   14.43   48.32  181.35 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 431.36473    7.38936  58.377   <2e-16 ***
## GEO           0.13625    0.06517   2.091    0.037 *  
## Sex          42.49623    2.98760  14.224   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 61.01 on 622 degrees of freedom
## Multiple R-squared:  0.2496, Adjusted R-squared:  0.2472 
## F-statistic: 103.4 on 2 and 622 DF,  p-value: < 2.2e-16
tsr_ON.5<- tslm(formula= train.ON[,5]~GEO+Sex,data=train.ON)
summary(tsr_ON.5)
## 
## Call:
## tslm(formula = train.ON[, 5] ~ GEO + Sex, data = train.ON)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -42.956 -10.918  -0.237  11.334 109.730 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 54.58994    2.06089  26.489   <2e-16 ***
## GEO          0.03397    0.01818   1.869   0.0621 .  
## Sex          7.98038    0.83324   9.578   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 17.02 on 622 degrees of freedom
## Multiple R-squared:  0.1329, Adjusted R-squared:  0.1301 
## F-statistic: 47.66 on 2 and 622 DF,  p-value: < 2.2e-16
tsr_ON.6<- tslm(formula= train.ON[,6]~GEO+Sex,data=train.ON)
summary(tsr_ON.6)
## 
## Call:
## tslm(formula = train.ON[, 6] ~ GEO + Sex, data = train.ON)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -29.811 -10.100   1.616   7.415  51.972 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 52.45088    1.47359  35.594  < 2e-16 ***
## GEO          0.03957    0.01300   3.045  0.00243 ** 
## Sex          6.90200    0.59579  11.585  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 12.17 on 622 degrees of freedom
## Multiple R-squared:  0.1876, Adjusted R-squared:  0.185 
## F-statistic: 71.84 on 2 and 622 DF,  p-value: < 2.2e-16
##tsr_ON.7 <- tslm(formula= train.ON[,7]~GEO, data=train.ON)
##summary(tsr_ON.7)
##tsr_ON.8<- tslm(formula= train.ON[,8]~GEO, data=train.ON)
##summary(tsr_ON.8)

tsr_QC<- tslm(formula = train.QC[,4]~GEO+Sex, data = train.QC)
summary(tsr_QC)
## 
## Call:
## tslm(formula = train.QC[, 4] ~ GEO + Sex, data = train.QC)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -244.95  -85.53   12.03   50.62  678.27 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  117.039    111.360   1.051 0.294382    
## GEO            4.239      1.265   3.352 0.000941 ***
## Sex           48.832      8.506   5.741 3.02e-08 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 105.2 on 226 degrees of freedom
## Multiple R-squared:  0.1646, Adjusted R-squared:  0.1572 
## F-statistic: 22.26 on 2 and 226 DF,  p-value: 1.494e-09
tsr_QC.5<- tslm(formula= train.QC[,5]~GEO+Sex, data=train.QC)
summary(tsr_QC.5)
## 
## Call:
## tslm(formula = train.QC[, 5] ~ GEO + Sex, data = train.QC)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -125.763  -30.540   -2.385   19.302  311.667 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -105.5370    55.1246  -1.915  0.05682 .  
## GEO            2.1234     0.6261   3.392  0.00082 ***
## Sex           17.9715     4.2108   4.268  2.9e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 52.08 on 226 degrees of freedom
## Multiple R-squared:  0.1171, Adjusted R-squared:  0.1093 
## F-statistic: 14.99 on 2 and 226 DF,  p-value: 7.733e-07
tsr_QC.6<- tslm(formula= train.QC[,6]~GEO+Sex, data=train.QC)
summary(tsr_QC.6)
## 
## Call:
## tslm(formula = train.QC[, 6] ~ GEO + Sex, data = train.QC)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -45.410 -17.142  -5.573   7.205 299.661 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -154.4394    39.7898  -3.881 0.000136 ***
## GEO            2.4964     0.4519   5.524 9.08e-08 ***
## Sex           10.7403     3.0394   3.534 0.000497 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 37.59 on 226 degrees of freedom
## Multiple R-squared:  0.1609, Adjusted R-squared:  0.1535 
## F-statistic: 21.67 on 2 and 226 DF,  p-value: 2.452e-09
##tsr_QC.7<- tslm(formula= train.QC[,7]~GEO, data=train.QC)
##summary(tsr_QC.7)
#tsr_QC.8<- tslm(formula= train.QC[,8]~GEO, data=train.QC)
#summary(tsr_QC.8)

tsr_BC<- tslm(formula = train.BC[,4]~GEO+Sex, data = train.BC)
summary(tsr_BC)
## 
## Call:
## tslm(formula = train.BC[, 4] ~ GEO + Sex, data = train.BC)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -143.711  -47.297    9.945   45.519  150.076 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 427.4896    12.3623  34.580  < 2e-16 ***
## GEO          -0.1688     0.1171  -1.442    0.151    
## Sex          37.6058     4.9332   7.623 9.47e-13 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 57.74 on 202 degrees of freedom
## Multiple R-squared:  0.229,  Adjusted R-squared:  0.2214 
## F-statistic:    30 on 2 and 202 DF,  p-value: 3.912e-12
tsr_BC.5<- tslm(formula = train.BC[,5]~GEO+Sex, data = train.BC)
summary(tsr_BC.5)
## 
## Call:
## tslm(formula = train.BC[, 5] ~ GEO + Sex, data = train.BC)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -23.630  -8.844  -0.679   8.441  31.503 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 61.48881    2.71480  22.649  < 2e-16 ***
## GEO         -0.05301    0.02571  -2.062   0.0405 *  
## Sex          5.79429    1.08334   5.349 2.39e-07 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 12.68 on 202 degrees of freedom
## Multiple R-squared:  0.1392, Adjusted R-squared:  0.1307 
## F-statistic: 16.33 on 2 and 202 DF,  p-value: 2.664e-07
tsr_BC.6<-tslm(formula = train.BC[,6]~GEO+Sex, data=train.BC)
summary(tsr_BC.6)
## 
## Call:
## tslm(formula = train.BC[, 6] ~ GEO + Sex, data = train.BC)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -23.845  -7.052   1.992   6.410  26.953 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 52.10754    2.18667  23.830  < 2e-16 ***
## GEO         -0.01514    0.02071  -0.731    0.466    
## Sex          5.76268    0.87259   6.604 3.46e-10 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 10.21 on 202 degrees of freedom
## Multiple R-squared:  0.1791, Adjusted R-squared:  0.171 
## F-statistic: 22.03 on 2 and 202 DF,  p-value: 2.209e-09
tsr_mid<-tslm(formula = train.mid[,4]~GEO+Sex, data=train.mid)
summary(tsr_mid)
## 
## Call:
## tslm(formula = train.mid[, 4] ~ GEO + Sex, data = train.mid)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -112.80  -39.73   13.94   38.60  101.23 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 484.30711    9.40910  51.472  < 2e-16 ***
## GEO          -0.30077    0.08149  -3.691 0.000268 ***
## Sex          36.03721    3.59248  10.031  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 49.91 on 286 degrees of freedom
## Multiple R-squared:  0.2844, Adjusted R-squared:  0.2794 
## F-statistic: 56.84 on 2 and 286 DF,  p-value: < 2.2e-16
tsr_mid.5<-tslm(formula = train.mid[,5]~GEO+Sex, data=train.mid)
summary(tsr_mid.5)
## 
## Call:
## tslm(formula = train.mid[, 5] ~ GEO + Sex, data = train.mid)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -28.634  -8.517  -2.382   6.140  75.202 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 67.83188    2.97618  22.792  < 2e-16 ***
## GEO         -0.03844    0.02578  -1.491    0.137    
## Sex          6.15756    1.13633   5.419 1.28e-07 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 15.79 on 286 degrees of freedom
## Multiple R-squared:  0.09911,    Adjusted R-squared:  0.09281 
## F-statistic: 15.73 on 2 and 286 DF,  p-value: 3.295e-07
tsr_mid.6<- tslm(formula = train.mid[,6]~GEO+Sex, data=train.mid)
summary(tsr_mid.6)
## 
## Call:
## tslm(formula = train.mid[, 6] ~ GEO + Sex, data = train.mid)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -25.983 -11.507   1.895   8.593  35.078 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 58.93722    2.44946  24.061  < 2e-16 ***
## GEO         -0.02100    0.02122  -0.990    0.323    
## Sex          7.11474    0.93523   7.608 4.06e-13 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 12.99 on 286 degrees of freedom
## Multiple R-squared:  0.1704, Adjusted R-squared:  0.1646 
## F-statistic: 29.37 on 2 and 286 DF,  p-value: 2.504e-12
tsr_mar<-tslm(formula = train.mar[,4]~GEO+Sex, data= train.mar)
summary(tsr_mar)
## 
## Call:
## tslm(formula = train.mar[, 4] ~ GEO + Sex, data = train.mar)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -184.48  -64.61   16.89   53.30  112.02 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 473.150844  22.772138  20.778   <2e-16 ***
## GEO          -0.001946   0.170043  -0.011    0.991    
## Sex          50.093376   5.455674   9.182   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 67.48 on 226 degrees of freedom
## Multiple R-squared:  0.2717, Adjusted R-squared:  0.2652 
## F-statistic: 42.15 on 2 and 226 DF,  p-value: 2.764e-16
tsr_mar.5<- tslm(formula = train.mar[,5]~GEO+Sex, data = train.mar)
summary(tsr_mar.5)
## 
## Call:
## tslm(formula = train.mar[, 5] ~ GEO + Sex, data = train.mar)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -42.55 -12.96   2.84  11.12  61.33 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 66.28167    5.51549  12.017  < 2e-16 ***
## GEO          0.01807    0.04119   0.439    0.661    
## Sex         11.10074    1.32138   8.401 4.94e-15 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 16.34 on 226 degrees of freedom
## Multiple R-squared:  0.2384, Adjusted R-squared:  0.2317 
## F-statistic: 35.37 on 2 and 226 DF,  p-value: 4.314e-14
tsr_mar.6<- tslm(formula = train.mar[,6]~GEO+Sex, data = train.mar)
summary(tsr_mar.6)
## 
## Call:
## tslm(formula = train.mar[, 6] ~ GEO + Sex, data = train.mar)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -28.351 -10.640   2.387   8.646  28.017 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 60.71140    4.22475  14.370  < 2e-16 ***
## GEO          0.01491    0.03155   0.473    0.637    
## Sex          8.28765    1.01215   8.188 1.95e-14 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 12.52 on 226 degrees of freedom
## Multiple R-squared:  0.2293, Adjusted R-squared:  0.2225 
## F-statistic: 33.62 on 2 and 226 DF,  p-value: 1.648e-13
tsr_terr<- tslm(formula = train.terr[,4]~GEO+Sex, data = train.terr)
summary(tsr_terr)
## 
## Call:
## tslm(formula = train.terr[, 4] ~ GEO + Sex, data = train.terr)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -116.667  -27.504   -0.104   31.033  103.371 
## 
## Coefficients: (1 not defined because of singularities)
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   488.04      21.45  22.755  < 2e-16 ***
## GEO               NA         NA      NA       NA    
## Sex            27.36      10.04   2.727  0.00993 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 50.15 on 35 degrees of freedom
## Multiple R-squared:  0.1752, Adjusted R-squared:  0.1516 
## F-statistic: 7.434 on 1 and 35 DF,  p-value: 0.009932
tsr_terr.5<- tslm(formula = train.terr[,5]~GEO+Sex, data = train.terr)
summary(tsr_terr.5)
## 
## Call:
## tslm(formula = train.terr[, 5] ~ GEO + Sex, data = train.terr)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -27.54 -10.51  -2.44  12.83  31.19 
## 
## Coefficients: (1 not defined because of singularities)
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   76.301      6.455  11.820 8.98e-14 ***
## GEO               NA         NA      NA       NA    
## Sex            5.070      3.021   1.678    0.102    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 15.09 on 35 degrees of freedom
## Multiple R-squared:  0.07449,    Adjusted R-squared:  0.04805 
## F-statistic: 2.817 on 1 and 35 DF,  p-value: 0.1022
tsr_terr.6<-tslm(formula = train.terr[,6]~GEO+Sex, data = train.terr)
##test predictions

newdata.1<- data.frame(
  ï..REF_DATE = test.ON[,1],
  GEO = test.ON[,2],
  Sex = test.ON[,3]
)
fcast.ON.4<-forecast(tsr_ON, newdata = newdata.1,h=2)
autoplot(fcast.ON.4)+
  autolayer(test.ON[,4])

newdata.2<- data.frame(
  GEO = test.ON[,2],
  Sex = test.ON[,3]
)
fcast.ON.5<-forecast(tsr_ON.5, newdata = newdata.2,h=2)
autoplot(fcast.ON.5)+
  autolayer(test.ON[,5])

newdata.3<- data.frame(
  GEO = test.ON[,2],
  Sex = test.ON[,3]
)
fcast.ON.6<-forecast(tsr_ON.6, newdata = newdata.3,h=2)
autoplot(fcast.ON.6, h=2)+
  autolayer(test.ON[,6])

newdata.1.1<- data.frame(
  GEO = test.QC[,2],
  Sex = test.QC[,3]
)
fcast.QC.4<-forecast(tsr_QC, newdata = newdata.1.1,h=2)
autoplot(fcast.QC.4)+
  autolayer(test.QC[,4])

newdata.2.1<- data.frame(
  GEO = test.QC[,2],
  Sex = test.QC[,3]
)
fcast.QC.5<-forecast(tsr_QC.5, newdata = newdata.2.1,h=2)
autoplot(fcast.QC.5)+
  autolayer(test.QC[,5])

newdata.3.1<- data.frame(
  GEO = test.QC[,2],
  Sex = test.QC[,3]
)
fcast.QC.6<-forecast(tsr_QC.6, newdata = newdata.3.1,h=2)
autoplot(fcast.QC.6)+
  autolayer(test.QC[,6])

newdata.1.2<- data.frame(
  GEO = test.BC[,2],
  Sex = test.BC[,3]
)
fcast.BC.4<-forecast(tsr_BC, newdata = newdata.1.2,h=2)
autoplot(fcast.BC.4)+
  autolayer(test.BC[,4])

newdata.2.2<- data.frame(
  GEO = test.BC[,2],
  Sex = test.BC[,3]
)
fcast.BC.5<-forecast(tsr_BC.5, newdata = newdata.2.2,h=2)
autoplot(fcast.BC.5)+
  autolayer(test.BC[,5])

newdata.3.2<- data.frame(
  GEO = test.BC[,2],
  Sex = test.BC[,3]
)
fcast.BC.6<-forecast(tsr_BC.6, newdata = newdata.3.2,h=2)
autoplot(fcast.BC.6)+
  autolayer(test.BC[,6])

newdata.1.3<- data.frame(
  GEO = test.mid[,2],
  Sex = test.mid[,3]
)
fcast.mid.4<-forecast(tsr_mid, newdata = newdata.1.3,h=2)
autoplot(fcast.mid.4)+
  autolayer(test.mid[,4])

newdata.2.3<- data.frame(
  GEO = test.mid[,2],
  Sex = test.mid[,3]
)
fcast.mid.5<-forecast(tsr_mid.5, newdata = newdata.2.3,h=2)
autoplot(fcast.mid.5)+
  autolayer(test.mid[,5])

newdata.3.3<- data.frame(
  GEO = test.mid[,2],
  Sex = test.mid[,3]
)
fcast.mid.6<-forecast(tsr_mid.6, newdata = newdata.3.3,h=2)
autoplot(fcast.mid.6)+
  autolayer(test.mid[,6])

newdata.1.4<- data.frame(
  GEO = test.mar[,2],
  Sex = test.mar[,3]
)
fcast.mar.4<-forecast(tsr_mar, newdata = newdata.1.4,h=2)
autoplot(fcast.mar.4)+
  autolayer(test.mar[,4])

newdata.2.4<- data.frame(
  GEO = test.mar[,2],
  Sex = test.mar[,3]
)
fcast.mar.5<-forecast(tsr_mar.5, newdata = newdata.2.4,h=2)
autoplot(fcast.mar.5)+
  autolayer(test.mar[,5])

newdata.3.4<- data.frame(
  GEO = test.mar[,2],
  Sex = test.mar[,3]
)
fcast.mar.6<-forecast(tsr_mar.6, newdata = newdata.3.4,h=2)
autoplot(fcast.mar.6)+
  autolayer(test.mar[,6])

newdata.1.5<- data.frame(
  GEO = test.terr[,2],
  Sex = test.terr[,3]
)
fcast.terr.4<-forecast(tsr_terr, newdata = newdata.1.5,h=2)
autoplot(fcast.terr.4)+
  autolayer(test.terr[,4])

newdata.2.5<- data.frame(
  GEO = test.terr[,2],
  Sex = test.terr[,3]
)
fcast.terr.5<-forecast(tsr_terr.5, newdata = newdata.2.5,h=2)
autoplot(fcast.terr.5)+
  autolayer(test.terr[,5])

newdata.3.5<- data.frame(
  GEO = test.terr[,2],
  Sex = test.terr[,3]
)
fcast.terr.6<-forecast(tsr_terr.6, newdata = newdata.3.5,h=2)
autoplot(fcast.terr.6)+
  autolayer(test.terr[,6])

##ARIMA model or SARIMA
##trying non seasonal ARIMA models
##with the currently non differenced data

##staring with auto.arima
####################All cancer sites
#(auto.arima(train.ON[,4], seasonal = TRUE))
#(Arima(train.ON[,4], order = c(1,0,5), seasonal = c(1,0,1)))#model from auto.arima
#AIC=6659.09   AICc=6659.45   BIC=6703.47

#(Arima(train.ON[,4], order = c(1,1,6), seasonal = c(0,1,1)))
#(arima_ON.m<- Arima(train.ON[,4], order = c(1,1,6), seasonal = c(1,1,1)))
#AIC=6127.88   AICc=6128.27   BIC=6171.37

#(arima_ON.m<- Arima(train.ON[,4], order = c(1,1,8), seasonal = c(0,1,1)))
#AIC=6089.67   AICc=6090.14   BIC=6137.51

arima_ON.m<- Arima(train.ON[,4], order = c(1,1,8), seasonal = c(0,1,2))####BEST AIC
#AIC=5865.88   AICc=5866.44   BIC=5918.07

#################Lungs
#(arima_ON.5<- auto.arima(train.ON[,5], seasonal = TRUE))
#(arima_ON.5<- Arima(train.ON[,5], order = c(1,0,5), seasonal = c(0,0,1)))#model from auto.arima
#AIC=5183.45   AICc=5183.75   BIC=5223.39

#(arima_ON.5.m<- Arima(train.ON[,5], order = c(1,0,5), seasonal = c(0,1,1)))
# AIC=4791.34   AICc=4791.6   BIC=4826.15

#(arima_ON.5.m<- Arima(train.ON[,5], order = c(1,0,7), seasonal = c(0,1,1))) 
#AIC=4742.87   AICc=4743.26   BIC=4786.38

#(arima_ON.5.m<- Arima(train.ON[,5], order = c(1,0,8), seasonal = c(0,1,2)))
#AIC=4506.99   AICc=4507.55   BIC=4559.2

arima_ON.5.m<- Arima(train.ON[,5], order = c(1,0,9), seasonal = c(0,1,2))##BEST AIC
#AIC=4497.12   AICc=4497.77   BIC=4553.68

#############################COLON
#(arima_ON.6<- auto.arima(train.ON[,6], seasonal = FALSE)) #False higher AIC than True
#(arima_ON.6.m<- Arima(train.ON[,6], order = c(1,0,5), seasonal = c(0,0,0)))#model from auto.arima
#AIC=4753.36   AICc=4753.59   BIC=4788.86

#(arima_ON.6.m<- Arima(train.ON[,6], order = c(1,0,6), seasonal = c(0,0,0)))
#AIC=4689.1   AICc=4689.39   BIC=4729.04

#(arima_ON.6.m<- Arima(train.ON[,6], order = c(1,0,5), seasonal = c(0,1,1)))
#AIC=4437.09   AICc=4437.35   BIC=4471.9

#(arima_ON.6.m<- Arima(train.ON[,6], order = c(1,0,6), seasonal = c(0,1,1)))
#AIC=4352.96   AICc=4353.28   BIC=4392.12

#(arima_ON.6.m<- Arima(train.ON[,6], order = c(1,0,7), seasonal = c(0,1,1)))
#AIC=4335.71   AICc=4336.1   BIC=4379.22

arima_ON.6.m<- Arima(train.ON[,6], order = c(1,0,8), seasonal = c(0,1,2))##BEST AIC
#AIC=4117.67   AICc=4118.23   BIC=4169.88
fcast_ON.4<-forecast(arima_ON.m)
autoplot(fcast_ON.4)+
  autolayer(test.ON[,4])

fcast_ON.5<-forecast(arima_ON.5.m)
autoplot(fcast_ON.5)+
  autolayer(test.ON[,5])

fcast_ON.6<-forecast(arima_ON.6.m)
autoplot(fcast_ON.6)+
  autolayer(test.ON[,6])

#######################All cancer sites
#(arima_QC<- auto.arima(train.QC[,4], seasonal = TRUE))
#(arima_QC<- Arima(train.QC[,4], order = c(1,0,1), seasonal = c(0,0,1)))##auto.arima model
#AIC=2813.12   AICc=2813.38   BIC=2830.28

#(arima_QC.m<- Arima(train.QC[,4], order = c(1,0,3), seasonal = c(0,0,1)))
#AIC=2799.16   AICc=2799.66   BIC=2823.19

#(arima_QC.m<- Arima(train.QC[,4], order = c(1,0,3), seasonal = c(0,1,1)))
#AIC=2570.48   AICc=2570.89   BIC=2590.56

#(arima_QC.m<- Arima(train.QC[,4], order = c(1,1,4), seasonal = c(0,1,1)))
#AIC=2568.64   AICc=2569.2   BIC=2592.04

arima_QC.m<- Arima(train.QC[,4], order = c(1,1,4), seasonal = c(0,1,2))##BEST AIC
#AIC=2526.05   AICc=2526.77   BIC=2552.79

##########################LUNGS
#(arima_QC.5<- auto.arima(train.QC[,5], seasonal = TRUE))
#(arima_QC.5<-Arima(train.QC[,5], order = c(0,0,1), seasonal = c(0,0,2)))##auto.arima model
#AIC=2407.76   AICc=2408.02   BIC=2424.92

#(arima_QC.5.m<-Arima(train.QC[,5], order = c(1,0,4), seasonal = c(0,1,2)))
#AIC=2190.36   AICc=2191.08   BIC=2217.14

#(arima_QC.5.m<-Arima(train.QC[,5], order = c(1,1,4), seasonal = c(0,1,2)))
#AIC=2188.35   AICc=2189.07   BIC=2215.09

#(arima_QC.5.m<-Arima(train.QC[,5], order = c(1,1,4), seasonal = c(0,1,3)))
#AIC=2178.95   AICc=2179.85   BIC=2209.03

#(arima_QC.5.m<-Arima(train.QC[,5], order = c(1,1,4), seasonal = c(0,1,4)))
#AIC=2125.93   AICc=2127.04   BIC=2159.36
arima_QC.5.m<-Arima(train.QC[,5], order = c(1,1,4), seasonal = c(0,1,5))
#AIC=2104.38   AICc=2105.72   BIC=2141.14

############################# Colon
#(arima_QC.6<- auto.arima(train.QC[,6], seasonal = TRUE))
#(arima_QC.6.m<-Arima(train.QC[,6], order = c(0,0,3), seasonal = c(0,0,1)))##auto.arima model
#AIC=2258.1   AICc=2258.48   BIC=2278.7

#(arima_QC.6.m<-Arima(train.QC[,6], order = c(4,0,3), seasonal = c(0,0,1)))
#AIC=2250.51   AICc=2251.52   BIC=2284.85

#(arima_QC.6.m<-Arima(train.QC[,6], order = c(1,0,3), seasonal = c(0,1,1)))
#AIC=2114.69   AICc=2115.11   BIC=2134.78

#(arima_QC.6.m<-Arima(train.QC[,6], order = c(1,0,3), seasonal = c(0,2,1)))
#AIC=2110.41   AICc=2110.86   BIC=2129.92

#(arima_QC.6.m<-Arima(train.QC[,6], order = c(1,0,3), seasonal = c(0,2,4)))
#AIC=2005.74   AICc=2006.74   BIC=2035.01

arima_QC.6.m<-Arima(train.QC[,6], order = c(1,0,3), seasonal = c(0,2,5))##BEST AIC
#AIC=1980.22   AICc=1981.44   BIC=2012.74
fcast_QC.4<-forecast(arima_QC.m)
autoplot(fcast_QC.4)+
  autolayer(test.QC[,4])

fcast_QC.5<-forecast(arima_QC.5.m)
autoplot(fcast_QC.5)+
  autolayer(test.QC[,5])

fcast_QC.6<-forecast(arima_QC.6.m)
autoplot(fcast_QC.6)+
  autolayer(test.QC[,6])

##############################ALL cancer sites
#(arima_BC<-auto.arima(train.BC[,4], seasonal = TRUE))
#(arima_BC<- Arima(train.BC[,4], order = c(1,0,5), seasonal = c(0,0,1)))#auto.arima model
#AIC=2187.68   AICc=2188.6   BIC=2217.58

#(arima_BC.m<- Arima(train.BC[,4], order = c(1,0,5), seasonal = c(0,1,1)))
#AIC=2041.8   AICc=2042.6   BIC=2067.69

#(arima_BC.m<- Arima(train.BC[,4], order = c(1,0,6), seasonal = c(0,1,1)))
#AIC=2020.01   AICc=2021.02   BIC=2049.14

#(arima_BC.m<- Arima(train.BC[,4], order = c(1,0,9), seasonal = c(0,1,2)))
#AIC=1917.45   AICc=1919.54   BIC=1959.52

#(arima_BC.m<- Arima(train.BC[,4], order = c(1,0,9), seasonal = c(0,1,3)))
#AIC=1906.1   AICc=1908.53   BIC=1951.41

#(arima_BC.m<- Arima(train.BC[,4], order = c(1,0,9), seasonal = c(0,1,4)))
#AIC=1847.06   AICc=1849.85   BIC=1895.61

#(arima_BC.m<- Arima(train.BC[,4], order = c(1,0,9), seasonal = c(0,1,5)))
#AIC=1823.34   AICc=1826.52   BIC=1875.12

#(arima_BC.m<- Arima(train.BC[,4], order = c(1,0,9), seasonal = c(0,1,6)))
#AIC=1815.42   AICc=1819.02   BIC=1870.44

#(arima_BC.m<- Arima(train.BC[,4], order = c(1,0,9), seasonal = c(0,1,7)))
#AIC=1784.21   AICc=1788.26   BIC=1842.46

arima_BC.m<- Arima(train.BC[,4], order = c(1,0,9), seasonal = c(0,1,8))
#AIC=1779.17   AICc=1783.69   BIC=1840.66
##takes forever to run

#############################################LUNGS
#(arima_BC.5<-auto.arima(train.BC[,5], seasonal = FALSE))##better than seasonal=TRUE
#AIC=1531.24   AICc=1531.98   BIC=1557.82

#(arima_BC.5.m<- Arima(train.BC[,5], order = c(2,0,4), seasonal = c(0,1,1)))
#AIC=1358.72   AICc=1359.53   BIC=1384.62

#(arima_BC.5.m<- Arima(train.BC[,5], order = c(2,0,4), seasonal = c(1,1,1)))
#AIC=1334.85   AICc=1335.86   BIC=1363.98

#(arima_BC.5.m<- Arima(train.BC[,5], order = c(2,0,4), seasonal = c(1,1,4)))
#AIC=1242.85   AICc=1244.64   BIC=1281.69

arima_BC.5.m<- Arima(train.BC[,5], order = c(2,0,4), seasonal = c(2,1,4))###BEST AIC
#AIC=1148.16   AICc=1150.25   BIC=1190.23

################################# Colon
#(arima_BC.6<-auto.arima(train.BC[,6], seasonal = TRUE))
#(arima_BC.6.m<- Arima(train.BC[,6], order = c(1,0,5), seasonal = c(0,0,2)))#auto.arima model
#AIC=1444.7   AICc=1445.83   BIC=1477.93

#(arima_BC.6.m<- Arima(train.BC[,6], order = c(1,0,5), seasonal = c(0,1,2)))
#AIC=1361.42   AICc=1362.43   BIC=1390.55

#(arima_BC.6.m<- Arima(train.BC[,6], order = c(1,0,5), seasonal = c(0,1,4)))
#AIC=1302.51   AICc=1304.01   BIC=1338.11

arima_BC.6.m<- Arima(train.BC[,6], order = c(1,0,5), seasonal = c(0,1,5))##BEST AIC
#AIC=1295.83   AICc=1297.62   BIC=1334.67
fcast_BC.4<-forecast(arima_BC.m)
autoplot(fcast_BC.4)+
  autolayer(test.BC[,4])

fcast_BC.5<-forecast(arima_BC.5.m)
autoplot(fcast_BC.5)+
  autolayer(test.BC[,5])

fcast_BC.6<-forecast(arima_BC.6.m)
autoplot(fcast_BC.6)+
  autolayer(test.BC[,6])

##################################ALL cancer sites
#(arima_mid<- auto.arima(train.mid[,4], seasonal= TRUE))
#(arima_mid.m<- Arima(train.mid[,4], order = c(2,0,2), seasonal = c(1,1,0)))#auto.arima model
#AIC=2581.3   AICc=2581.62   BIC=2602.77

#(arima_mid.m<- Arima(train.mid[,4], order = c(2,0,3), seasonal = c(1,1,0)))
#AIC=2577.45   AICc=2577.88   BIC=2602.51

#(arima_mid.m<- Arima(train.mid[,4], order = c(2,0,3), seasonal = c(1,1,1)))
#AIC=2555.31   AICc=2555.87   BIC=2583.95

#(arima_mid.m<- Arima(train.mid[,4], order = c(2,0,3), seasonal = c(1,2,1)))
#AIC=2442.21   AICc=2442.83   BIC=2470.09

#(arima_mid.m<- Arima(train.mid[,4], order = c(2,0,3), seasonal = c(1,2,3)))
#AIC=2419.11   AICc=2420.07   BIC=2453.96

#(arima_mid.m<- Arima(train.mid[,4], order = c(2,0,3), seasonal = c(1,2,5)))
#AIC=2412.21   AICc=2413.58   BIC=2454.02

arima_mid.m<- Arima(train.mid[,4], order = c(2,0,3), seasonal = c(1,2,6))##no noticeable improvement from 6->8 
#AIC=2405.48   AICc=2407.08   BIC=2450.78

###########################################LUNGS
#(arima_mid.5<- auto.arima(train.mid[,5], seasonal= TRUE))
#(arima_mid.5.m<- Arima(train.mid[,5], order = c(5,1,2), seasonal = c(2,0,1)))##auto.arima model
#AIC=2239.1   AICc=2240.24   BIC=2283.06

#(arima_mid.5.m<- Arima(train.mid[,5], order = c(5,1,3), seasonal = c(2,0,1)))
#AIC=2169.99   AICc=2171.12   BIC=2213.94

#(arima_mid.5.m<- Arima(train.mid[,5], order = c(5,1,3), seasonal = c(2,1,1)))
#AIC=2022.71   AICc=2023.95   BIC=2065.62

#(arima_mid.5.m<- Arima(train.mid[,5], order = c(5,1,3), seasonal = c(2,1,2)))
#AIC=2020.75   AICc=2022.2   BIC=2067.23

#(arima_mid.5.m<- Arima(train.mid[,5], order = c(5,1,3), seasonal = c(2,2,2)))
#AIC=2000.26   AICc=2001.87   BIC=2045.51

#(arima_mid.5.m<- Arima(train.mid[,5], order = c(5,2,3), seasonal = c(2,2,2)))
#AIC=1991.14   AICc=1992.75   BIC=2036.33 ##also solved warning in previous version 

arima_mid.5.m<- Arima(train.mid[,5], order = c(5,2,4), seasonal = c(2,2,2))##BEST AIC
#AIC=1971.84   AICc=1973.72   BIC=2020.52

########################################## Colon
#(arima_mid.6<- auto.arima(train.mid[,6], seasonal= TRUE))
#(arima_mid.5.m<- Arima(train.mid[,6], order = c(2,0,0), seasonal = c(2,1,1))) ##auto.arima model
#AIC=1904.11   AICc=1904.55   BIC=1929.17

#(arima_mid.5.m<- Arima(train.mid[,6], order = c(2,0,2), seasonal = c(2,1,1)))
#AIC=1898.68   AICc=1899.24   BIC=1927.32

#(arima_mid.5.m<- Arima(train.mid[,6], order = c(2,1,2), seasonal = c(2,1,1)))
#AIC=1891.71   AICc=1892.28   BIC=1920.32

#(arima_mid.5.m<- Arima(train.mid[,6], order = c(2,1,2), seasonal = c(2,1,2)))
#AIC=1887.73   AICc=1888.44   BIC=1919.92

#(arima_mid.5.m<- Arima(train.mid[,6], order = c(2,1,2), seasonal = c(2,2,2)))
#AIC=1851.87   AICc=1852.65   BIC=1883.19

arima_mid.6.m<- Arima(train.mid[,6], order = c(2,1,2), seasonal = c(2,3,2)) ##BEST AIC
#AIC=1840.36   AICc=1841.23   BIC=1870.74
fcast_mid.4<-forecast(arima_mid.m)
autoplot(fcast_mid.4)+
  autolayer(test.mid[,4])

fcast_mid.5<-forecast(arima_mid.5.m)
autoplot(fcast_mid.5)+
  autolayer(test.mid[,5])

fcast_mid.6<-forecast(arima_mid.6.m)
autoplot(fcast_mid.6) +
  autolayer(test.mid[,6])

#(arima_mar<- auto.arima(train.mar[,4], seasonal = TRUE))
#(arima_mar.m <- Arima(train.mar[,4], order = c(4,1,0) ,seasonal = c(0,0,2))) #auto.arima model
#AIC=2201.95   AICc=2202.61   BIC=2229.38

#(arima_mar.m <- Arima(train.mar[,4], order = c(4,1,1) ,seasonal = c(0,0,2)))
#AIC=2198.94   AICc=2199.6   BIC=2226.37

#(arima_mar.m <- Arima(train.mar[,4], order = c(4,1,3) ,seasonal = c(0,0,2)))
#AIC=2131.73   AICc=2132.74   BIC=2166.02

#(arima_mar.m <- Arima(train.mar[,4], order = c(4,1,3) ,seasonal = c(0,1,2)))
#AIC=2002.72   AICc=2003.83   BIC=2036.14

#(arima_mar.m <- Arima(train.mar[,4], order = c(4,1,3) ,seasonal = c(0,2,2)))
#AIC=1916.7   AICc=1917.93   BIC=1949.17

arima_mar.m <- Arima(train.mar[,4], order = c(4,1,3) ,seasonal = c(0,2,3))## BEST AIC
#AIC=1915.16   AICc=1916.65   BIC=1950.88

######################################LUNGS
#(arima_mar.5<- auto.arima(train.mar[,5], seasonal = TRUE))
#(arima_mar.5.m<- Arima(train.mar[,5], order = c(5,0,1), seasonal = c(0,0,2)))##auto.arima model
#AIC=1663.09   AICc=1664.1   BIC=1697.42

#(arima_mar.5.m<- Arima(train.mar[,5], order = c(5,0,1), seasonal = c(0,1,2)))
#AIC=1581.21   AICc=1582.11   BIC=1611.33

#(arima_mar.5.m<- Arima(train.mar[,5], order = c(5,0,2), seasonal = c(0,1,2)))
#AIC=1558.89   AICc=1559.99   BIC=1592.36

#(arima_mar.5.m<- Arima(train.mar[,5], order = c(5,0,2), seasonal = c(0,2,2)))
#AIC=1516.86   AICc=1518.08   BIC=1549.38

#(arima_mar.5.m<- Arima(train.mar[,5], order = c(5,0,2), seasonal = c(0,3,2)))
#AIC=1474.06   AICc=1475.43   BIC=1505.54

arima_mar.5.m<- Arima(train.mar[,5], order = c(5,0,3), seasonal = c(0,3,2))##BEST AIC
#AIC=1472.04   AICc=1473.69   BIC=1506.66

############################### Colon
#(arima_mar.6<- auto.arima(train.mar[,6], seasonal = TRUE))
#(arima_mar.6.m<- Arima(train.mar[,6], order = c(0,1,5), seasonal = c(0,0,1))) #auto.arima model
#AIC=1668.67   AICc=1669.33   BIC=1696.1

#(arima_mar.6.m<- Arima(train.mar[,6], order = c(0,1,5), seasonal = c(0,1,1)))
#AIC=1596.2   AICc=1596.75   BIC=1619.59

#(arima_mar.6.m<- Arima(train.mar[,6], order = c(0,1,6), seasonal = c(0,1,1)))
#AIC=1566.62   AICc=1567.34   BIC=1593.36

#(arima_mar.6.m<- Arima(train.mar[,6], order = c(0,1,10), seasonal = c(0,1,4)))
#AIC=1497.4   AICc=1499.89   BIC=1547.53

#(arima_mar.6.m<- Arima(train.mar[,6], order = c(0,1,10), seasonal = c(0,1,5)))
#AIC=1491.06   AICc=1493.89   BIC=1544.54

#(arima_mar.6.m<- Arima(train.mar[,6], order = c(0,1,10), seasonal = c(0,2,5)))
#AIC=1471.18   AICc=1474.32   BIC=1523.13

#(arima_mar.6.m<- Arima(train.mar[,6], order = c(0,1,10), seasonal = c(0,3,5)))
#AIC=1469.12   AICc=1472.65   BIC=1519.38

arima_mar.6.m<- Arima(train.mar[,6], order = c(0,1,10), seasonal = c(0,3,6))##BEST AIC
#AIC=1462.55   AICc=1466.55   BIC=1515.95
fcast_mar.4<-forecast(arima_mar.m)
autoplot(fcast_mar.4) +
  autolayer(test.mar[,4])

fcast_mar.5<-forecast(arima_mar.5.m)
autoplot(fcast_mar.5) +
  autolayer(test.mar[,5])

fcast_mar.6<-forecast(arima_mar.6.m)
autoplot(fcast_mar.6) +
  autolayer(test.mar[,6])

#####################ALL cancer sites
#(arima_terr<-auto.arima(train.terr[,4], seasonal = TRUE, stepwise = FALSE, approximation = FALSE))
arima_terr.m<- Arima(train.terr[,4], order = c(0,0,0), seasonal = c(0,1,0))##auto.arima model
#AIC=347.42   AICc=347.55   BIC=348.95
##BEST AIC

############################### LUNGS
#(arima_terr.5<-auto.arima(train.terr[,5], seasonal = TRUE))
arima_terr.5.m<- Arima(train.terr[,5], order= c(0,0,0), seasonal = c(2,0,0))##auto.arima model
#AIC=283.85   AICc=285.1   BIC=290.29
##BEST AIC

################################# COlon
#(arima_terr.6<-auto.arima(train.terr[,6], seasonal = TRUE))
#>  ARIMA(2,1,0) 
#>  AIC=346.54   AICc=347.29   BIC=351.29
#(arima_terr.6.m<- Arima(train.terr[,5], order= c(2,1,1), seasonal = c(0,0,0)))
#AIC=287.9   AICc=289.19   BIC=294.24

#(arima_terr.6.m<- Arima(train.terr[,5], order= c(2,1,1), seasonal = c(0,1,0)))
#AIC=266.43   AICc=267.86   BIC=272.42

arima_terr.6.m<- Arima(train.terr[,5], order= c(2,1,1), seasonal = c(0,2,0))##BEST AIC
#AIC=258.47   AICc=260.07   BIC=264.07
fcast_terr.4<-forecast(arima_terr.m)
autoplot(fcast_terr.4) +
  autolayer(test.terr[,4])

fcast_terr.5<-forecast(arima_terr.5.m)
autoplot(fcast_terr.5) +
  autolayer(test.terr[,5])

fcast_terr.6<-forecast(arima_terr.6.m)
autoplot(fcast_terr.6) +
  autolayer(test.terr[,6])

##forecast function
f.ON<- function(y, h){forecast(arima_ON.m, h=h)}
##explore Exogenous predictor variables with xreg
#compare RMSE
on.1<-tsCV(ts_ON.2[,4], f.ON, h=2)
sqrt(mean(on.1^2, na.rm= TRUE))
## [1] 84.76661
##[1] 84.76661
sqrt(mean(residuals(f.ON(ts_ON.2, h=2))^2, na.rm=TRUE))
## [1] 30.27293
####[1] 30.27293

######lungs forcast
f.ON.5<-function(y, h){forecast(arima_ON.5.m, h=h)}
on.2<-tsCV(ts_ON.2[,5], f.ON.5, h=2)
sqrt(mean(on.2^2, na.rm= TRUE))
## [1] 19.05896
##[1] 19.05896
sqrt(mean(residuals(f.ON.5(ts_ON.2[,5], h=2))^2, na.rm=TRUE))
## [1] 9.144005
#[1] 9.144005
#####colon
f.ON.6<-function(y, h){forecast(arima_ON.6.m, h=h)}
on.3<-tsCV(ts_ON.2[,6], f.ON.6, h=2)
sqrt(mean(on.3^2, na.rm= TRUE))
## [1] 15.02973
##[1] 15.02973
sqrt(mean(residuals(f.ON.6(ts_ON.2[,6], h=2))^2, na.rm=TRUE))
## [1] 6.616241
###[1] 6.616241
#####################ARIMA

accuracy(fcast_ON.4, test.ON[,4])
##                      ME     RMSE      MAE        MPE    MAPE      MASE
## Training set -0.3357554 30.27293 23.43676 -0.5624577 4.55796 0.2358884
## Test set     -1.4377948 53.13471 44.33229 -1.4543186 8.54579 0.4461995
##                    ACF1 Theil's U
## Training set -0.1072211        NA
## Test set     -0.3663429 0.4820504
accuracy(fcast_ON.5, test.ON[,5])
##                      ME     RMSE      MAE       MPE      MAPE      MASE
## Training set  0.1860213 9.144005 6.693404 -1.711729  9.932586 0.3072607
## Test set     -0.7983518 9.947122 7.837666 -3.390988 11.725109 0.3597881
##                    ACF1 Theil's U
## Training set -0.0137609        NA
## Test set      0.0532865 0.5794236
accuracy(fcast_ON.6, test.ON[,6])
##                      ME     RMSE      MAE        MPE      MAPE      MASE
## Training set  0.2270698 6.616241 4.918528 -0.9328313  7.405991 0.2734132
## Test set     -1.4757116 9.968210 7.966383 -4.5149638 12.613496 0.4428388
##                     ACF1 Theil's U
## Training set -0.02254191        NA
## Test set     -0.36294868 0.4776273
################TSLM
accuracy(fcast.ON.4, test.ON[,4])
##                         ME     RMSE      MAE        MPE     MAPE      MASE
## Training set -2.732470e-15 60.86150 52.47408 -1.3887475 10.37529 0.5281456
## Test set      7.725614e+00 60.60299 54.77697  0.1155038 10.64216 0.5513240
##                    ACF1 Theil's U
## Training set -0.1614651        NA
## Test set     -0.2009413  0.547298
accuracy(fcast.ON.5, test.ON[,5])
##                        ME     RMSE      MAE       MPE     MAPE      MASE
## Training set 8.288981e-16 16.97424 13.27611 -5.976449 20.35705 0.6094400
## Test set     1.941021e-01 12.61792 10.04751 -3.399509 15.62458 0.4612312
##                   ACF1 Theil's U
## Training set 0.2514777        NA
## Test set     0.3068294 0.8082756
accuracy(fcast.ON.6, test.ON[,6])
##                         ME     RMSE      MAE       MPE     MAPE      MASE
## Training set -4.091172e-16 12.13703 9.895152 -3.169308 15.19297 0.5500560
## Test set     -4.956624e-01 11.21434 9.422390 -3.817739 15.13001 0.5237759
##                     ACF1 Theil's U
## Training set -0.05051096        NA
## Test set     -0.18129412 0.5417806

NOTE: MASE all are <1

#####################ARIMA
accuracy(fcast_QC.4, test.QC[,4])
##                     ME     RMSE      MAE         MPE     MAPE      MASE
## Training set -3.402105 81.03498 58.78771 -2.11893804 9.923888 0.4261876
## Test set      5.704239 69.29902 56.19228 -0.05398666 9.875326 0.4073718
##                     ACF1 Theil's U
## Training set -0.01218758        NA
## Test set     -0.16173147 0.5462605
accuracy(fcast_QC.5, test.QC[,5])
##                      ME     RMSE      MAE       MPE     MAPE      MASE
## Training set -0.7977578 24.50994 16.87430      -Inf      Inf 0.2848762
## Test set     -1.0018107 21.84940 16.00479 -3.434945 15.49545 0.2701968
##                      ACF1 Theil's U
## Training set -0.004871119        NA
## Test set      0.140461623  0.428896
accuracy(fcast_QC.6, test.QC[,6])
##                     ME     RMSE       MAE       MPE     MAPE      MASE
## Training set -1.380253 25.09385 13.542098 -4.913560 15.87571 0.3871488
## Test set     -2.787314 11.30498  8.680159 -5.462049 12.75794 0.2481531
##                      ACF1 Theil's U
## Training set -0.002606181        NA
## Test set     -0.103577339 0.5255439
################TSLM
accuracy(fcast.QC.4, test.QC[,4])
##                         ME      RMSE      MAE       MPE      MAPE      MASE
## Training set  7.460068e-16 104.52210 75.16150 -2.629721 12.907732 0.5448911
## Test set     -4.602742e+00  65.33167 53.35346 -2.118524  9.709814 0.3867914
##                    ACF1 Theil's U
## Training set  0.1988024        NA
## Test set     -0.1889286 0.5185937
accuracy(fcast.QC.5, test.QC[,5])
##                         ME     RMSE      MAE       MPE     MAPE      MASE
## Training set -9.235016e-16 51.74001 33.52550      -Inf      Inf 0.5659859
## Test set     -5.286283e+00 27.38949 22.31059 -11.57328 24.18798 0.3766530
##                    ACF1 Theil's U
## Training set  0.4051380        NA
## Test set     -0.0446594 0.5712133
accuracy(fcast.QC.6, test.QC[,6])
##                         ME     RMSE      MAE       MPE     MAPE      MASE
## Training set  9.636578e-16 37.34677 20.42861 -8.449642 22.82466 0.5840241
## Test set     -5.562955e+00 14.44000 11.31650 -9.841291 17.28957 0.3235223
##                   ACF1 Theil's U
## Training set 0.2982451        NA
## Test set     0.1847868 0.6478085
#####################ARIMA
accuracy(fcast_BC.4, test.BC[,4])
##                     ME     RMSE      MAE        MPE     MAPE      MASE
## Training set -1.527825 14.98964 11.51634 -0.5274741 2.391969 0.1231133
## Test set     12.934360 30.71271 21.95231  2.0028823 4.079514 0.2346772
##                     ACF1 Theil's U
## Training set -0.01966609        NA
## Test set     -0.21532894 0.3046769
accuracy(fcast_BC.5, test.BC[,5])
##                      ME     RMSE      MAE        MPE     MAPE      MASE
## Training set -0.2626233 3.532935 2.614178 -0.6773540 3.806916 0.1508118
## Test set      0.5055518 5.768628 3.911727 -0.1145212 5.622419 0.2256673
##                     ACF1 Theil's U
## Training set -0.03900797        NA
## Test set      0.35746108 0.3048858
accuracy(fcast_BC.6, test.BC[,6])
##                     ME     RMSE      MAE        MPE      MAPE      MASE
## Training set 0.4600199 5.503453 4.245021 -0.3776554  6.994721 0.2647857
## Test set     0.1792858 7.680355 6.341853 -1.3306671 10.103317 0.3955768
##                     ACF1 Theil's U
## Training set  0.02153032        NA
## Test set     -0.12049978 0.4498501
################TSLM
accuracy(fcast.BC.4, test.BC[,4])
##                        ME     RMSE      MAE       MPE     MAPE      MASE
## Training set 5.540284e-16 57.31390 48.83758 -1.386427 10.21451 0.5220895
## Test set     8.112898e+00 59.73265 52.53303  0.249694 10.59902 0.5615950
##                    ACF1 Theil's U
## Training set -0.1743779        NA
## Test set     -0.1910338 0.5457354
accuracy(fcast.BC.5, test.BC[,5])
##                        ME     RMSE      MAE       MPE     MAPE      MASE
## Training set 2.010217e-15 12.58629 10.34324 -3.370094 15.50199 0.5967012
## Test set     3.019499e-01 13.83539 10.87916 -3.181984 15.94336 0.6276181
##                   ACF1 Theil's U
## Training set 0.1904812        NA
## Test set     0.4598499  0.781279
accuracy(fcast.BC.6, test.BC[,6])
##                        ME      RMSE      MAE        MPE     MAPE      MASE
## Training set 6.925355e-17 10.137763 8.143271 -2.8533769 13.95716 0.5079413
## Test set     9.665315e-01  9.440846 8.249721 -0.7821131 13.69748 0.5145811
##                    ACF1 Theil's U
## Training set -0.1029082        NA
## Test set     -0.1487001 0.4998617
#####################ARIMA
accuracy(fcast_mid.4, test.mid[,4])
##                      ME     RMSE      MAE        MPE     MAPE      MASE
## Training set   2.846237 24.62107 17.02520  0.3594439 3.180817 0.5353263
## Test set     -13.350272 24.62355 21.39882 -2.4703575 4.015196 0.6728470
##                      ACF1 Theil's U
## Training set -0.008567216        NA
## Test set      0.261144498 0.2343431
accuracy(fcast_mid.5, test.mid[,5])
##                       ME     RMSE      MAE        MPE      MAPE      MASE
## Training set  0.04218503 9.173924 6.403709 -0.5009628  8.460447 0.4780637
## Test set     -6.69998552 9.846109 7.976579 -9.9995206 11.864569 0.5954851
##                     ACF1 Theil's U
## Training set -0.02733384        NA
## Test set      0.19375542 0.7186801
accuracy(fcast_mid.6, test.mid[,6])
##                      ME     RMSE      MAE        MPE     MAPE      MASE
## Training set -0.2642139 9.413829 6.084825 -0.6310913 8.881973 0.6569747
## Test set      1.8358503 7.256528 5.450974  2.8121503 8.141033 0.5885382
##                      ACF1 Theil's U
## Training set  0.004247999        NA
## Test set     -0.351646059  0.425444
################TSLM
accuracy(fcast.mid.4, test.mid[,4])
##                         ME     RMSE      MAE        MPE     MAPE     MASE
## Training set -1.190077e-15 49.64674 42.49145 -0.8846553 8.165776 1.336067
## Test set     -2.096376e+01 54.83271 40.74082 -5.0178291 8.442828 1.281021
##                    ACF1 Theil's U
## Training set -0.2685497        NA
## Test set     -0.3980486 0.4919871
accuracy(fcast.mid.5, test.mid[,5])
##                         ME     RMSE      MAE        MPE     MAPE      MASE
## Training set -7.071129e-16 15.70369 10.94078  -3.523441 14.09613 0.8167749
## Test set     -9.716891e+00 12.99978 10.33062 -15.868500 16.65156 0.7712239
##                    ACF1 Theil's U
## Training set 0.25195557        NA
## Test set     0.01659973 0.9038385
accuracy(fcast.mid.6, test.mid[,6])
##                         ME     RMSE       MAE        MPE     MAPE     MASE
## Training set -1.479317e-16 12.92446 10.767460  -3.434188 16.00222 1.162556
## Test set     -9.464619e+00 13.12395  9.855274 -17.589116 18.14554 1.064068
##                     ACF1 Theil's U
## Training set -0.02792106        NA
## Test set     -0.32190182 0.6785667
#####################ARIMA
accuracy(fcast_mar.4, test.mar[,4])
##                      ME     RMSE      MAE         MPE     MAPE      MASE
## Training set  0.3748883 26.79171 19.22015 -0.03847801 3.467458 0.1669458
## Test set     -1.7818321 45.60905 38.05424 -1.11150570 7.069351 0.3305383
##                     ACF1 Theil's U
## Training set -0.00384057        NA
## Test set     -0.06842116 0.2647759
accuracy(fcast_mar.5, test.mar[,5])
##                      ME     RMSE       MAE         MPE      MAPE      MASE
## Training set  0.6745681 10.36877  7.255549   0.3295996  8.282918 0.2955188
## Test set     -6.4261880 32.74793 26.085511 -16.0184045 32.872583 1.0624638
##                      ACF1 Theil's U
## Training set  0.006177642        NA
## Test set     -0.384584439 0.5029747
accuracy(fcast_mar.6, test.mar[,6])
##                     ME      RMSE       MAE        MPE      MAPE      MASE
## Training set 0.0843151  8.527493  5.868156 -0.5208992  7.831055 0.2892413
## Test set     4.7871196 23.601140 17.650280  3.1251743 25.629737 0.8699821
##                     ACF1 Theil's U
## Training set -0.08125448        NA
## Test set     -0.34709521  1.035569
################TSLM
accuracy(fcast.mar.4, test.mar[,4])
##                         ME     RMSE      MAE       MPE     MAPE      MASE
## Training set  1.719709e-15 67.03822 58.60668 -1.436155 10.69647 0.5090563
## Test set     -2.785333e+01 84.00296 67.07349 -7.578495 13.58759 0.5825988
##                    ACF1 Theil's U
## Training set -0.3164878        NA
## Test set     -0.3764124  0.499807
accuracy(fcast.mar.5, test.mar[,5])
##                    ME     RMSE      MAE       MPE     MAPE      MASE       ACF1
## Training set 0.000000 16.23688 13.01931 -3.424512 15.59274 0.5302771 -0.3400050
## Test set     2.736364 27.33734 23.42896 -8.155137 29.66163 0.9542623 -0.4326846
##              Theil's U
## Training set        NA
## Test set     0.4822365
accuracy(fcast.mar.6, test.mar[,6])
##                         ME     RMSE      MAE        MPE     MAPE      MASE
## Training set -1.731306e-15 12.43713 10.51453  -2.609385 14.11454 0.5182612
## Test set     -9.998602e+00 14.85506 11.52174 -17.859130 19.63211 0.5679064
##                    ACF1 Theil's U
## Training set -0.1458899        NA
## Test set     -0.3400125 0.5747104
#####################ARIMA
accuracy(fcast_terr.4, test.terr[,4])
##                     ME     RMSE      MAE        MPE     MAPE      MASE
## Training set -6.120165 37.28218 28.90686 -1.3954971 5.494962 0.9204283
## Test set      3.516667 38.77106 35.61667  0.0400974 6.799237 1.1340763
##                     ACF1 Theil's U
## Training set  0.09744538        NA
## Test set     -0.23667890 0.3825451
accuracy(fcast_terr.5, test.terr[,5])
##                      ME      RMSE      MAE         MPE     MAPE      MASE
## Training set  0.4571086  9.578816  7.40855  -0.7693159  8.82341 0.7928571
## Test set     -5.4459145 17.358887 12.83183 -12.0646412 19.72843 1.3732517
##                      ACF1 Theil's U
## Training set -0.003140624        NA
## Test set     -0.486631480 0.5206454
accuracy(fcast_terr.6, test.terr[,6])
##                    ME     RMSE      MAE      MPE     MAPE     MASE        ACF1
## Training set 1.960658 13.43061 10.29128 2.784172 12.18069 1.101364 -0.04762778
## Test set     3.554966 18.44070 16.41814 4.896150 20.91724 1.757056 -0.58049364
##              Theil's U
## Training set        NA
## Test set      1.217407
################TSLM
accuracy(fcast.terr.4, test.terr[,4])
##                         ME     RMSE      MAE        MPE      MAPE     MASE
## Training set  1.535708e-14 48.77680 37.20890 -0.8171954  6.964245 1.184775
## Test set     -3.068323e+01 64.19984 50.57348 -7.2725797 10.430766 1.610319
##                     ACF1 Theil's U
## Training set  0.05067287        NA
## Test set     -0.31542376 0.5129719
accuracy(fcast.terr.5, test.terr[,5])
##                         ME     RMSE      MAE        MPE     MAPE     MASE
## Training set  9.603335e-16 14.68081 12.68075  -2.962001 15.24828 1.357083
## Test set     -8.073052e+00 17.13564 12.37991 -15.555256 19.88131 1.324888
##                    ACF1 Theil's U
## Training set -0.2043604        NA
## Test set     -0.3772719  0.452088
accuracy(fcast.terr.6, test.terr[,6])
##                         ME     RMSE      MAE        MPE     MAPE     MASE
## Training set -1.533519e-15 33.73368 23.19238  -6.106848 19.55314 1.183285
## Test set     -3.335152e+01 34.99038 33.35152 -42.337592 42.33759 1.701608
##                     ACF1 Theil's U
## Training set -0.07129035        NA
## Test set     -0.17748632  2.061652